[Deep_Learning] Word Embedding (Eng)

FastText: training a small model on gensim's common_texts
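
For real work you would usually load Facebook's pretrained wiki FastText vectors instead of training on a toy corpus. A minimal loading sketch (not run here), assuming gensim 3.8+ and a wiki.en.bin downloaded from fasttext.cc; the filename is hypothetical:

from gensim.models.fasttext import load_facebook_vectors

wiki_wv = load_facebook_vectors('wiki.en.bin')  # returns FastTextKeyedVectors
print(wiki_wv.most_similar('computer')[:3])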

In [1]:
from gensim.test.utils import common_texts
from gensim.models import FastText
In [2]:
# gensim 3.x argument names; in gensim 4.x these are vector_size= and epochs=
ft_model = FastText(common_texts, size=4, window=3, min_count=1, iter=10)
In [3]:
ft_model.init_sims(replace=True)  # normalize the vectors to unit length in place (deprecated in gensim 4.x)
In [4]:
common_texts[:5]
Out[4]:
[['human', 'interface', 'computer'],
 ['survey', 'user', 'computer', 'system', 'response', 'time'],
 ['eps', 'user', 'interface', 'system'],
 ['system', 'human', 'system', 'eps'],
 ['user', 'response', 'time']]
In [5]:
similarities = ft_model.wv.most_similar(positive=['computer', 'human'], negative=['interface'])
In [6]:
most_similar = similarities[0]
In [7]:
print(most_similar)
('graph', 0.344123899936676)
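
The positive/negative query above amounts to vector arithmetic: words are ranked by cosine similarity to roughly computer + human - interface. A short numpy sketch of the same computation, reusing the vectors normalized by init_sims above:

import numpy as np

# Roughly what most_similar does: cosine similarity between each candidate and
# the combined, unit-normalized query vector computer + human - interface.
query = ft_model.wv['computer'] + ft_model.wv['human'] - ft_model.wv['interface']
query /= np.linalg.norm(query)
cand = ft_model.wv['graph'] / np.linalg.norm(ft_model.wv['graph'])
print(float(np.dot(query, cand)))  # comparable to the 0.344 score above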
In [8]:
not_matching = ft_model.wv.doesnt_match("human computer interface tree".split())
C:\Users\BowlMin\Anaconda3\envs\py36\lib\site-packages\gensim\models\keyedvectors.py:877: FutureWarning: arrays to stack must be passed as a "sequence" type such as list or tuple. Support for non-sequence iterables such as generators is deprecated as of NumPy 1.16 and will raise an error in the future.
  vectors = vstack(self.word_vec(word, use_norm=True) for word in used_words).astype(REAL)
In [9]:
print(not_matching)
tree
In [10]:
sim_score = ft_model.wv.similarity('computer', 'human')
In [11]:
print(sim_score)
0.80814165
In [12]:
sim_score = ft_model.wv.similarity('computer', 'interface')
In [13]:
print(sim_score)
0.5414237
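
The similarity queries above only cover in-vocabulary words. FastText's defining feature is that it composes vectors from character n-grams, so it can also embed strings it never saw during training. A quick check, using the gensim 3.x API from this notebook ('computre' is a made-up misspelling; in gensim 4.x, wv.vocab becomes wv.key_to_index):

# FastText builds vectors from character n-grams, so even an out-of-vocabulary
# string like the misspelling 'computre' gets a vector.
print('computre' in ft_model.wv.vocab)                 # False: not a trained word
print(ft_model.wv.similarity('computer', 'computre'))  # still computable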
In [14]:
from gensim.test.utils import common_texts
from gensim.models import word2vec
In [15]:
# Parameter settings
num_features = 300    # word vector dimensionality
min_word_count = 40   # minimum word count (10-100 is reasonable; helps limit the vocabulary to meaningful words)
num_workers = 4       # number of worker threads for parallel processing
context = 10          # context window size (number of surrounding words to consider)
downsampling = 1e-3   # downsampling of frequent words (Google's documentation recommends values between 0.00001 and 0.001)
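
The cell after this sketch passes only min_count; on a real corpus the settings above would be wired in roughly as follows (gensim 3.x argument names are assumed; gensim 4.x renames size= to vector_size= and iter= to epochs=):

full_model = word2vec.Word2Vec(common_texts,       # stand-in for a real corpus
                               workers=num_workers,
                               size=num_features,
                               window=context,
                               sample=downsampling,
                               min_count=1)        # min_word_count=40 would empty this toy corpus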
In [16]:
# Train the model (only min_count is passed here; see the sketch above for the full parameter set)
model = word2vec.Word2Vec(common_texts, min_count=1)
In [17]:
model.init_sims(replace=True)  # normalize the vectors to unit length in place (deprecated in gensim 4.x)
In [18]:
common_texts[:5]
Out[18]:
[['human', 'interface', 'computer'],
 ['survey', 'user', 'computer', 'system', 'response', 'time'],
 ['eps', 'user', 'interface', 'system'],
 ['system', 'human', 'system', 'eps'],
 ['user', 'response', 'time']]
In [19]:
similarities = model.wv.most_similar(positive=['computer', 'human'], negative=['interface'])
In [20]:
most_similar = similarities[0]
In [21]:
print(most_similar)
('trees', 0.17821332812309265)
In [22]:
not_matching = model.wv.doesnt_match("human computer interface tree".split())
In [23]:
print(not_matching)
interface
In [24]:
sim_score = model.wv.similarity('computer', 'human')
In [25]:
print(sim_score)
0.04529048
In [26]:
sim_score = model.wv.similarity('computer', 'interface')
In [27]:
print(sim_score)
-0.007194359
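
With only nine short sentences, these Word2Vec scores are essentially noise: the similarities sit near zero, and doesnt_match picks 'interface' where FastText picked 'tree'. On a real corpus you would also want to persist the trained vectors; a minimal sketch with the standard gensim API (the filename is hypothetical):

from gensim.models import KeyedVectors

model.wv.save_word2vec_format('w2v_common_texts.txt')  # plain-text word2vec format
wv = KeyedVectors.load_word2vec_format('w2v_common_texts.txt')
print(wv.similarity('computer', 'human'))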
In [28]:
from gensim.test.utils import common_texts
from glove import Corpus, Glove  # glove_python package
In [29]:
corpus = Corpus()
In [30]:
corpus.fit(common_texts, window=10)  # build the word-word co-occurrence matrix with a 10-word window
In [31]:
glove = Glove(no_components=100, learning_rate=0.05)
In [32]:
glove.fit(corpus.matrix, epochs=30, no_threads=4, verbose=True)
Performing 30 training epochs with 4 threads
Epoch 0
Epoch 1
...
Epoch 29
In [33]:
glove.add_dictionary(corpus.dictionary)  # attach the word->id mapping so similarity queries work
In [35]:
similarities = glove.most_similar('computer')
In [36]:
most_similar = similarities[0]
In [37]:
print(most_similar)
('user', 0.19786983792912138)
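
Beyond most_similar, the raw embeddings are available directly; a small sketch, assuming the glove-python API used above:

# glove-python keeps the embedding matrix in glove.word_vectors, indexed
# through the word->id mapping attached via add_dictionary above.
vec = glove.word_vectors[glove.dictionary['computer']]
print(vec.shape)  # (100,): matches no_components above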